#install.packages("corrplot")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(corrplot)
## corrplot 0.95 loaded
library(magrittr) # syntaxe, notamment affectation %<>%
##
## Attaching package: 'magrittr'
##
## The following object is masked from 'package:purrr':
##
## set_names
##
## The following object is masked from 'package:tidyr':
##
## extract
library(GGally) # plot pairs better than default plot
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(plotly) # plots interactifs
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(ggplot2)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(flextable)
##
## Attaching package: 'flextable'
##
## The following objects are masked from 'package:plotly':
##
## highlight, style
##
## The following object is masked from 'package:purrr':
##
## compose
library(tibble)
library(e1071)
library(reshape2)
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(keras)
library(tensorflow)
##
## Attaching package: 'tensorflow'
##
## The following object is masked from 'package:caret':
##
## train
read_csv("data/heart.csv")
## Rows: 918 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Sex, ChestPainType, RestingECG, ExerciseAngina, ST_Slope
## dbl (7): Age, RestingBP, Cholesterol, FastingBS, MaxHR, Oldpeak, HeartDisease
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 918 × 12
## Age Sex ChestPainType RestingBP Cholesterol FastingBS RestingECG MaxHR
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 40 M ATA 140 289 0 Normal 172
## 2 49 F NAP 160 180 0 Normal 156
## 3 37 M ATA 130 283 0 ST 98
## 4 48 F ASY 138 214 0 Normal 108
## 5 54 M NAP 150 195 0 Normal 122
## 6 39 M NAP 120 339 0 Normal 170
## 7 45 F ATA 130 237 0 Normal 170
## 8 54 M ATA 110 208 0 Normal 142
## 9 37 M ASY 140 207 0 Normal 130
## 10 48 F ATA 120 284 0 Normal 120
## # ℹ 908 more rows
## # ℹ 4 more variables: ExerciseAngina <chr>, Oldpeak <dbl>, ST_Slope <chr>,
## # HeartDisease <dbl>
tb <- read_csv("data/heart.csv") %>%
mutate_if(is.character, factor)
## Rows: 918 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Sex, ChestPainType, RestingECG, ExerciseAngina, ST_Slope
## dbl (7): Age, RestingBP, Cholesterol, FastingBS, MaxHR, Oldpeak, HeartDisease
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(tb)
## # A tibble: 6 × 12
## Age Sex ChestPainType RestingBP Cholesterol FastingBS RestingECG MaxHR
## <dbl> <fct> <fct> <dbl> <dbl> <dbl> <fct> <dbl>
## 1 40 M ATA 140 289 0 Normal 172
## 2 49 F NAP 160 180 0 Normal 156
## 3 37 M ATA 130 283 0 ST 98
## 4 48 F ASY 138 214 0 Normal 108
## 5 54 M NAP 150 195 0 Normal 122
## 6 39 M NAP 120 339 0 Normal 170
## # ℹ 4 more variables: ExerciseAngina <fct>, Oldpeak <dbl>, ST_Slope <fct>,
## # HeartDisease <dbl>
tb$HeartDisease
## [1] 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0 0 1 0 1 1 0 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 1
## [38] 0 0 0 0 1 0 0 1 0 0 0 0 1 1 1 0 0 0 0 1 1 0 1 0 0 0 1 0 0 0 0 1 0 1 0 1 0
## [75] 1 0 1 0 0 1 0 0 1 0 1 1 1 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0
## [112] 1 0 0 0 1 1 1 0 1 1 0 0 1 0 0 0 0 0 0 0 1 1 1 0 1 0 0 1 1 1 1 1 0 1 0 0 0
## [149] 0 1 0 0 0 0 0 1 1 0 1 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 1 0 1 0 0
## [186] 1 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 1 1 0 0 1 0 1 0 0 0 1 1
## [223] 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 1 1 1 0 1 1 0 1 0 1 1 1 1 1 1 0 0 1 0 0 0 0
## [260] 0 0 0 1 1 1 0 1 0 1 0 0 0 1 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1
## [297] 1 1 1 1 1 0 1 1 1 1 1 0 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0
## [334] 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1
## [371] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [408] 1 1 1 1 1 1 1 1 1 1 0 1 1 0 0 1 0 1 1 0 1 1 1 1 0 1 1 0 0 1 1 1 0 1 1 1 1
## [445] 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 0 1 1 1 0 1 0 1 0 1 0 1 1 1 1 0 1 0 1 1 1 1
## [482] 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 0 1 1 0 1 0 1 1 0 1 1
## [519] 1 1 0 1 1 1 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 1 1 0 1 0 1 1 0
## [556] 1 0 1 1 1 0 0 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 0 0
## [593] 1 1 1 1 1 0 1 1 0 1 1 1 0 0 1 1 1 1 1 0 1 0 1 1 0 1 0 0 0 1 1 1 1 0 0 0 1
## [630] 0 0 1 1 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 1 1 1 1 1 0 0 1 0 0 0 1 0 1 1 1 1 1
## [667] 0 0 0 0 0 1 0 1 1 0 1 0 0 0 1 0 1 0 1 1 0 0 0 0 1 0 0 0 0 1 1 1 0 0 0 0 0
## [704] 0 1 0 1 1 1 1 1 0 1 0 0 0 1 0 1 1 1 0 1 1 0 1 0 1 0 0 0 1 1 0 1 1 1 1 0 0
## [741] 0 1 0 0 1 1 1 0 1 0 0 0 1 0 0 1 0 1 0 1 1 1 1 1 0 0 0 0 0 0 0 1 0 0 1 1 1
## [778] 0 1 0 0 0 0 0 1 0 1 1 0 0 1 1 1 1 0 0 1 1 0 0 0 1 0 0 1 0 1 0 1 0 0 0 0 0
## [815] 1 0 1 1 1 1 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 1 1 0 1 0 0 1 1 0 0 1 1 0 1 0 1
## [852] 0 1 0 0 1 0 0 1 0 1 1 0 1 1 1 0 1 0 0 0 0 1 1 0 0 1 1 0 1 0 0 0 0 1 0 0 1
## [889] 1 1 0 0 0 1 0 1 0 1 0 1 1 1 0 0 0 1 0 1 1 1 0 1 1 1 1 1 1 0
c('Normale','Malade')[ tb$HeartDisease+1] %>% # tb$HeartDisease donne l'indice à aller chercher dans le vecteur ['Normale','Malade']
as.factor
## [1] Normale Malade Normale Malade Normale Normale Normale Normale Malade
## [10] Normale Normale Malade Normale Malade Normale Normale Malade Normale
## [19] Malade Malade Normale Normale Normale Malade Normale Normale Normale
## [28] Normale Normale Normale Malade Normale Malade Malade Normale Normale
## [37] Malade Normale Normale Normale Normale Malade Normale Normale Malade
## [46] Normale Normale Normale Normale Malade Malade Malade Normale Normale
## [55] Normale Normale Malade Malade Normale Malade Normale Normale Normale
## [64] Malade Normale Normale Normale Normale Malade Normale Malade Normale
## [73] Malade Normale Malade Normale Malade Normale Normale Malade Normale
## [82] Normale Malade Normale Malade Malade Malade Normale Malade Normale
## [91] Normale Normale Normale Malade Normale Malade Normale Normale Normale
## [100] Normale Malade Normale Malade Malade Malade Normale Normale Normale
## [109] Normale Normale Normale Malade Normale Normale Normale Malade Malade
## [118] Malade Normale Malade Malade Normale Normale Malade Normale Normale
## [127] Normale Normale Normale Normale Normale Malade Malade Malade Normale
## [136] Malade Normale Normale Malade Malade Malade Malade Malade Normale
## [145] Malade Normale Normale Normale Normale Malade Normale Normale Normale
## [154] Normale Normale Malade Malade Normale Malade Normale Malade Malade
## [163] Normale Normale Normale Malade Malade Normale Normale Normale Normale
## [172] Normale Normale Normale Malade Malade Malade Normale Normale Normale
## [181] Malade Normale Malade Normale Normale Malade Normale Malade Normale
## [190] Malade Normale Normale Normale Normale Normale Normale Normale Normale
## [199] Malade Normale Normale Normale Normale Normale Normale Normale Normale
## [208] Malade Normale Malade Malade Malade Normale Normale Malade Normale
## [217] Malade Normale Normale Normale Malade Malade Normale Normale Normale
## [226] Malade Normale Malade Normale Normale Normale Normale Normale Normale
## [235] Normale Normale Malade Malade Malade Malade Normale Malade Malade
## [244] Normale Malade Normale Malade Malade Malade Malade Malade Malade
## [253] Normale Normale Malade Normale Normale Normale Normale Normale Normale
## [262] Normale Malade Malade Malade Normale Malade Normale Malade Normale
## [271] Normale Normale Malade Normale Normale Normale Malade Malade Normale
## [280] Normale Normale Malade Normale Normale Normale Normale Normale Normale
## [289] Normale Normale Normale Normale Normale Malade Malade Malade Malade
## [298] Malade Malade Malade Malade Normale Malade Malade Malade Malade
## [307] Malade Normale Malade Malade Normale Malade Malade Malade Normale
## [316] Malade Malade Malade Malade Malade Malade Malade Malade Malade
## [325] Malade Malade Normale Malade Malade Malade Malade Malade Normale
## [334] Malade Malade Malade Normale Malade Malade Malade Malade Malade
## [343] Malade Malade Malade Malade Malade Malade Malade Malade Malade
## [352] Malade Malade Malade Malade Malade Malade Malade Malade Malade
## [361] Malade Malade Malade Malade Normale Malade Malade Malade Malade
## [370] Malade Malade Malade Malade Malade Malade Malade Malade Malade
## [379] Malade Malade Malade Malade Malade Malade Malade Malade Malade
## [388] Malade Malade Malade Malade Malade Malade Malade Malade Malade
## [397] Malade Malade Malade Malade Malade Malade Malade Malade Malade
## [406] Malade Malade Malade Malade Malade Malade Malade Malade Malade
## [415] Malade Malade Malade Normale Malade Malade Normale Normale Malade
## [424] Normale Malade Malade Normale Malade Malade Malade Malade Normale
## [433] Malade Malade Normale Normale Malade Malade Malade Normale Malade
## [442] Malade Malade Malade Malade Malade Malade Malade Malade Malade
## [451] Malade Malade Malade Normale Malade Normale Malade Malade Malade
## [460] Normale Malade Malade Malade Normale Malade Normale Malade Normale
## [469] Malade Normale Malade Malade Malade Malade Normale Malade Normale
## [478] Malade Malade Malade Malade Malade Malade Malade Malade Malade
## [487] Normale Malade Normale Malade Malade Malade Malade Malade Malade
## [496] Malade Normale Malade Malade Malade Malade Malade Malade Normale
## [505] Malade Malade Malade Normale Malade Malade Normale Malade Normale
## [514] Malade Malade Normale Malade Malade Malade Malade Normale Malade
## [523] Malade Malade Normale Normale Malade Normale Malade Malade Malade
## [532] Malade Malade Malade Malade Malade Malade Malade Malade Normale
## [541] Malade Malade Malade Malade Normale Normale Malade Malade Malade
## [550] Normale Malade Normale Malade Malade Normale Malade Normale Malade
## [559] Malade Malade Normale Normale Normale Malade Malade Malade Normale
## [568] Malade Malade Malade Malade Malade Malade Malade Malade Malade
## [577] Malade Malade Malade Malade Malade Malade Malade Normale Malade
## [586] Malade Malade Normale Malade Malade Normale Normale Malade Malade
## [595] Malade Malade Malade Normale Malade Malade Normale Malade Malade
## [604] Malade Normale Normale Malade Malade Malade Malade Malade Normale
## [613] Malade Normale Malade Malade Normale Malade Normale Normale Normale
## [622] Malade Malade Malade Malade Normale Normale Normale Malade Normale
## [631] Normale Malade Malade Normale Normale Malade Normale Normale Normale
## [640] Normale Normale Normale Normale Malade Normale Malade Normale Normale
## [649] Malade Malade Malade Malade Malade Normale Normale Malade Normale
## [658] Normale Normale Malade Normale Malade Malade Malade Malade Malade
## [667] Normale Normale Normale Normale Normale Malade Normale Malade Malade
## [676] Normale Malade Normale Normale Normale Malade Normale Malade Normale
## [685] Malade Malade Normale Normale Normale Normale Malade Normale Normale
## [694] Normale Normale Malade Malade Malade Normale Normale Normale Normale
## [703] Normale Normale Malade Normale Malade Malade Malade Malade Malade
## [712] Normale Malade Normale Normale Normale Malade Normale Malade Malade
## [721] Malade Normale Malade Malade Normale Malade Normale Malade Normale
## [730] Normale Normale Malade Malade Normale Malade Malade Malade Malade
## [739] Normale Normale Normale Malade Normale Normale Malade Malade Malade
## [748] Normale Malade Normale Normale Normale Malade Normale Normale Malade
## [757] Normale Malade Normale Malade Malade Malade Malade Malade Normale
## [766] Normale Normale Normale Normale Normale Normale Malade Normale Normale
## [775] Malade Malade Malade Normale Malade Normale Normale Normale Normale
## [784] Normale Malade Normale Malade Malade Normale Normale Malade Malade
## [793] Malade Malade Normale Normale Malade Malade Normale Normale Normale
## [802] Malade Normale Normale Malade Normale Malade Normale Malade Normale
## [811] Normale Normale Normale Normale Malade Normale Malade Malade Malade
## [820] Malade Normale Normale Normale Malade Normale Malade Normale Normale
## [829] Malade Normale Normale Normale Normale Normale Normale Malade Malade
## [838] Normale Malade Normale Normale Malade Malade Normale Normale Malade
## [847] Malade Normale Malade Normale Malade Normale Malade Normale Normale
## [856] Malade Normale Normale Malade Normale Malade Malade Normale Malade
## [865] Malade Malade Normale Malade Normale Normale Normale Normale Malade
## [874] Malade Normale Normale Malade Malade Normale Malade Normale Normale
## [883] Normale Normale Malade Normale Normale Malade Malade Malade Normale
## [892] Normale Normale Malade Normale Malade Normale Malade Normale Malade
## [901] Malade Malade Normale Normale Normale Malade Normale Malade Malade
## [910] Malade Normale Malade Malade Malade Malade Malade Malade Normale
## Levels: Malade Normale
Modification du tibble : fonction mutate
tb= tb %>%
mutate(HeartDisease=as.factor( c('Normale','Malade'))[ HeartDisease +1] )
tb$HeartDisease=as.factor(tb$HeartDisease)
Ceci permet de remplacer les factors 0,1 par normale et malade
sum(is.na(data))
## Warning in is.na(data): is.na() applied to non-(list or vector) of type
## 'closure'
## [1] 0
numerical_columns <- c("Age", "RestingBP", "Cholesterol", "MaxHR", "Oldpeak", "HeartDisease")
tb_num= tb[numerical_columns]
tb_num
## # A tibble: 918 × 6
## Age RestingBP Cholesterol MaxHR Oldpeak HeartDisease
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 40 140 289 172 0 Normale
## 2 49 160 180 156 1 Malade
## 3 37 130 283 98 0 Normale
## 4 48 138 214 108 1.5 Malade
## 5 54 150 195 122 0 Normale
## 6 39 120 339 170 0 Normale
## 7 45 130 237 170 0 Normale
## 8 54 110 208 142 0 Normale
## 9 37 140 207 130 1.5 Malade
## 10 48 120 284 120 0 Normale
## # ℹ 908 more rows
En x les classes, et pour chaque classe, les effectifs, sous
forme de barplot :
tb %>%
ggplot(aes(x=HeartDisease, fill=HeartDisease)) + # aes pour aesthetic
geom_bar(stat = 'count') + # https://ggplot2.tidyverse.org/reference/geom_bar.html
geom_text(stat='count', aes(label=..count..), vjust=-0.5) # pour afficher les effectifs
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
pivot_longer de tidyr
(plusieurs variables → (variable name, variable value) )Manip pour avoir des tuples (classe, variable, value)
tbg = tb_num %>%
rowid_to_column() %>% # ajout d'un identifiant de ligne
pivot_longer(Age:Oldpeak, names_to='variable', values_to='value')
tbg
## # A tibble: 4,590 × 4
## rowid HeartDisease variable value
## <int> <fct> <chr> <dbl>
## 1 1 Normale Age 40
## 2 1 Normale RestingBP 140
## 3 1 Normale Cholesterol 289
## 4 1 Normale MaxHR 172
## 5 1 Normale Oldpeak 0
## 6 2 Malade Age 49
## 7 2 Malade RestingBP 160
## 8 2 Malade Cholesterol 180
## 9 2 Malade MaxHR 156
## 10 2 Malade Oldpeak 1
## # ℹ 4,580 more rows
rowid sera nécessaire si on veut revenir au format de
départ sinon impossible de savoir quelle mesures (d’alcool, de ash, …)
il faut rassembler pour reconstruire le tableau de départ.
tbg %>%
ggplot(aes(HeartDisease, value, color=HeartDisease)) +
geom_boxplot() +
facet_wrap(~ variable)
Conclusion : On voit bien que Age et Oldpeak sont
écrasé par les autres Variables quantitatives. Il est donc pertinent de
normaliser
Plot interactif avec plotly (on sauvegarde le résultat
de ggplot et on le passe à ggplotly)
p = tbg %>%
ggplot(aes(HeartDisease, value, color=HeartDisease)) +
geom_boxplot() +
facet_wrap(~ variable)
ggplotly(p)
On enregistre les paramètres de normalisation (z-score) de chacune des variables :
znorm = tbg %>%
group_by(variable) %>%
summarize(mean=mean(value), sd=sd(value), min = min(value), max=max(value), median=median(value))
znorm
## # A tibble: 5 × 6
## variable mean sd min max median
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Age 53.5 9.43 28 77 54
## 2 Cholesterol 199. 109. 0 603 223
## 3 MaxHR 137. 25.5 60 202 138
## 4 Oldpeak 0.887 1.07 -2.6 6.2 0.6
## 5 RestingBP 132. 18.5 0 200 130
jointure et ajout des z-scores
tbg %<>%
inner_join(znorm, by='variable') %>%
mutate(value.z = (value-mean)/sd)
tbg
## # A tibble: 4,590 × 10
## rowid HeartDisease variable value mean sd min max median value.z
## <int> <fct> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 Normale Age 40 53.5 9.43 28 77 54 -1.43
## 2 1 Normale RestingBP 140 132. 18.5 0 200 130 0.411
## 3 1 Normale Cholester… 289 199. 109. 0 603 223 0.825
## 4 1 Normale MaxHR 172 137. 25.5 60 202 138 1.38
## 5 1 Normale Oldpeak 0 0.887 1.07 -2.6 6.2 0.6 -0.832
## 6 2 Malade Age 49 53.5 9.43 28 77 54 -0.478
## 7 2 Malade RestingBP 160 132. 18.5 0 200 130 1.49
## 8 2 Malade Cholester… 180 199. 109. 0 603 223 -0.172
## 9 2 Malade MaxHR 156 137. 25.5 60 202 138 0.754
## 10 2 Malade Oldpeak 1 0.887 1.07 -2.6 6.2 0.6 0.106
## # ℹ 4,580 more rows
Vérification de la normalisation (moyenne à 0 et écart-type à 1)
tbg %>%
group_by(variable) %>%
summarize(moyenne=round(mean(value.z), 4), `écart-type`=sd(value.z)) # nom de colonne avec un é pour illustrer mais à éviter en général
## # A tibble: 5 × 3
## variable moyenne `écart-type`
## <chr> <dbl> <dbl>
## 1 Age 0 1
## 2 Cholesterol 0 1
## 3 MaxHR 0 1
## 4 Oldpeak 0 1
## 5 RestingBP 0 1
ggplot +
facetVisualisation
tbg %>%
ggplot(aes(HeartDisease, value.z, color=HeartDisease)) +
geom_violin() +
geom_jitter(alpha=.3, width=.15, size=0.5) +
facet_wrap(~ variable)
Conclusion : Maintenant les variables quantitatives
sont comparables.
# Normalisation des variables numériques avec scale
tb_numpur=tb[numerical_columns] %>%
select(-HeartDisease)
tb_numpur <- scale(tb_numpur)
colSums(tb_numpur)
## Age RestingBP Cholesterol MaxHR Oldpeak
## -1.107239e-13 1.838182e-13 1.890328e-14 4.727919e-13 -4.220582e-15
# Calcul de la matrice de corrélation
correlation_matrix <- cor(tb_numpur)
# Affichage de la matrice de corrélation
print(correlation_matrix)
## Age RestingBP Cholesterol MaxHR Oldpeak
## Age 1.00000000 0.2543994 -0.09528177 -0.3820447 0.25861154
## RestingBP 0.25439936 1.0000000 0.10089294 -0.1121350 0.16480304
## Cholesterol -0.09528177 0.1008929 1.00000000 0.2357924 0.05014811
## MaxHR -0.38204468 -0.1121350 0.23579240 1.0000000 -0.16069055
## Oldpeak 0.25861154 0.1648030 0.05014811 -0.1606906 1.00000000
# Visualisation de la corrélation
corrplot(correlation_matrix, method = "circle")
Conclusion : Si deux variables sont fortement corrélées
(|corr| > 0.8), l’une peut être supprimée car elles apportent la même
information. Mais ici, toutes les corrélations sont faibles, donc aucune
variable n’est totalement redondante et donc on garde nos toutes belles
variables hhh.
# Tester l'indépendance avec la variable cible (Cardiopathie) pour chaque variable catégorielle
tb$FastingBS= as.factor(tb$FastingBS)
categorical_columns <- c("Sex", "ChestPainType", "RestingECG", "ExerciseAngina", "ST_Slope", "FastingBS")
for (col in categorical_columns) {
#cat("\nTest du chi-carré pour", col, ":\n")
test_result <- chisq.test(table(tb[[col]], tb$HeartDisease))
print(test_result)
}
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(tb[[col]], tb$HeartDisease)
## X-squared = 84.145, df = 1, p-value < 2.2e-16
##
##
## Pearson's Chi-squared test
##
## data: table(tb[[col]], tb$HeartDisease)
## X-squared = 268.07, df = 3, p-value < 2.2e-16
##
##
## Pearson's Chi-squared test
##
## data: table(tb[[col]], tb$HeartDisease)
## X-squared = 10.931, df = 2, p-value = 0.004229
##
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(tb[[col]], tb$HeartDisease)
## X-squared = 222.26, df = 1, p-value < 2.2e-16
##
##
## Pearson's Chi-squared test
##
## data: table(tb[[col]], tb$HeartDisease)
## X-squared = 355.92, df = 2, p-value < 2.2e-16
##
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(tb[[col]], tb$HeartDisease)
## X-squared = 64.321, df = 1, p-value = 1.057e-15
# La méthode de prof
tb_cat= tb[categorical_columns]
x= tb[categorical_columns]
y=tb$HeartDisease
sapply(colnames(x), function(a)chisq.test(x=x[,a], y=y)$p.value)
## Sex ChestPainType RestingECG ExerciseAngina ST_Slope
## 4.597617e-20 8.083728e-58 4.229233e-03 2.907808e-50 5.167638e-78
## FastingBS
## 1.057302e-15
Conclusion : p-value < 0.05 : Il y a une relation significative entre toutes les variables catégorielles et HeartDisease. Toutes les variables catégorielles sont informatives et doivent être conservées.
table(tb$HeartDisease, tb_cat$ChestPainType)
##
## ASY ATA NAP TA
## Malade 392 24 72 20
## Normale 104 149 131 26
plot(table(tb_cat$ChestPainType, tb$HeartDisease), main='ChestPainType')
tb %>%
ggbivariate(outcome="HeartDisease", explanatory= colnames(tb[categorical_columns]))
# Affichage
foo=sapply(colnames(x),
function(a)
plot(
table(unlist(x[,a]), y),
main=a
)
)
colonne_numerique <- c("Age", "RestingBP", "Cholesterol", "MaxHR", "Oldpeak")
tb_final=tb
tb_final[colonne_numerique]= scale(tb_final[colonne_numerique])
tb_final
## # A tibble: 918 × 12
## Age Sex ChestPainType RestingBP Cholesterol FastingBS RestingECG MaxHR
## <dbl> <fct> <fct> <dbl> <dbl> <fct> <fct> <dbl>
## 1 -1.43 M ATA 0.411 0.825 0 Normal 1.38
## 2 -0.478 F NAP 1.49 -0.172 0 Normal 0.754
## 3 -1.75 M ATA -0.129 0.770 0 ST -1.52
## 4 -0.584 F ASY 0.303 0.139 0 Normal -1.13
## 5 0.0519 M NAP 0.951 -0.0347 0 Normal -0.582
## 6 -1.54 M NAP -0.670 1.28 0 Normal 1.30
## 7 -0.902 F ATA -0.129 0.349 0 Normal 1.30
## 8 0.0519 M ATA -1.21 0.0841 0 Normal 0.204
## 9 -1.75 M ASY 0.411 0.0750 0 Normal -0.267
## 10 -0.584 F ATA -0.670 0.779 0 Normal -0.660
## # ℹ 908 more rows
## # ℹ 4 more variables: ExerciseAngina <fct>, Oldpeak <dbl>, ST_Slope <fct>,
## # HeartDisease <fct>
set.seed(42) # Pour garantir la reproductibilité
trainIndex <- createDataPartition(tb_final$HeartDisease, p = 2/3, list = FALSE)
trainData <- tb[trainIndex, ]
testData <- tb[-trainIndex, ]
# Vérification des dimensions des ensembles
dim(trainData)
## [1] 613 12
dim(testData)
## [1] 305 12
rf_model <- randomForest(HeartDisease ~ ., ntree = 50000, data = trainData, importance = TRUE) # Entraîner un modèle random forest sur l'ensemble d'entraînement
predictions <- predict(rf_model, newdata = testData) # Prédictions
confusionMatrix(predictions, testData$HeartDisease) # Matrice de confusion
## Confusion Matrix and Statistics
##
## Reference
## Prediction Malade Normale
## Malade 155 26
## Normale 14 110
##
## Accuracy : 0.8689
## 95% CI : (0.8257, 0.9046)
## No Information Rate : 0.5541
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.7323
##
## Mcnemar's Test P-Value : 0.08199
##
## Sensitivity : 0.9172
## Specificity : 0.8088
## Pos Pred Value : 0.8564
## Neg Pred Value : 0.8871
## Prevalence : 0.5541
## Detection Rate : 0.5082
## Detection Prevalence : 0.5934
## Balanced Accuracy : 0.8630
##
## 'Positive' Class : Malade
##
table(predictions,testData$HeartDisease)
##
## predictions Malade Normale
## Malade 155 26
## Normale 14 110
importance(rf_model) # afficher l'importance des variables
## Malade Normale MeanDecreaseAccuracy MeanDecreaseGini
## Age 32.02809 76.86594 77.94278 22.901831
## Sex 138.78582 142.99520 190.51302 10.557129
## ChestPainType 228.96091 242.17487 317.95004 41.499691
## RestingBP 63.79699 10.34377 56.12416 22.173653
## Cholesterol 97.68881 129.15609 154.38563 31.523692
## FastingBS 116.72690 96.60586 146.15849 7.512203
## RestingECG 40.55291 12.46801 39.32065 7.868187
## MaxHR 138.99369 11.65900 128.30665 29.339153
## ExerciseAngina 122.64457 164.80044 200.43852 25.493103
## Oldpeak 71.14422 230.03367 221.77873 31.061201
## ST_Slope 336.02894 531.89618 582.27555 68.487545
varImpPlot(rf_model, main = "Importance des variables") # visualiser l'importance des variables
nb_model <- naiveBayes(HeartDisease ~ ., data = trainData)
nb_predictions <- predict(nb_model, newdata = testData)
confusionMatrix(nb_predictions, testData$HeartDisease)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Malade Normale
## Malade 155 23
## Normale 14 113
##
## Accuracy : 0.8787
## 95% CI : (0.8367, 0.9131)
## No Information Rate : 0.5541
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7529
##
## Mcnemar's Test P-Value : 0.1884
##
## Sensitivity : 0.9172
## Specificity : 0.8309
## Pos Pred Value : 0.8708
## Neg Pred Value : 0.8898
## Prevalence : 0.5541
## Detection Rate : 0.5082
## Detection Prevalence : 0.5836
## Balanced Accuracy : 0.8740
##
## 'Positive' Class : Malade
##
print(nb_model)
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## Malade Normale
## 0.5530179 0.4469821
##
## Conditional probabilities:
## Age
## Y [,1] [,2]
## Malade 55.63717 8.874598
## Normale 50.55474 9.661994
##
## Sex
## Y F M
## Malade 0.1091445 0.8908555
## Normale 0.3357664 0.6642336
##
## ChestPainType
## Y ASY ATA NAP TA
## Malade 0.77581121 0.04129794 0.13274336 0.05014749
## Normale 0.25182482 0.35036496 0.34671533 0.05109489
##
## RestingBP
## Y [,1] [,2]
## Malade 132.8584 19.63869
## Normale 130.0073 16.50064
##
## Cholesterol
## Y [,1] [,2]
## Malade 174.6755 127.0620
## Normale 225.9599 75.7364
##
## FastingBS
## Y 0 1
## Malade 0.6578171 0.3421829
## Normale 0.8905109 0.1094891
##
## RestingECG
## Y LVH Normal ST
## Malade 0.2153392 0.5575221 0.2271386
## Normale 0.1861314 0.6569343 0.1569343
##
## MaxHR
## Y [,1] [,2]
## Malade 129.0413 22.75802
## Normale 147.0000 22.74722
##
## ExerciseAngina
## Y N Y
## Malade 0.3834808 0.6165192
## Normale 0.8722628 0.1277372
##
## Oldpeak
## Y [,1] [,2]
## Malade 1.2351032 1.1172826
## Normale 0.4233577 0.7420655
##
## ST_Slope
## Y Down Flat Up
## Malade 0.10619469 0.73156342 0.16224189
## Normale 0.02189781 0.19343066 0.78467153
model_nb_laplace <- naiveBayes(HeartDisease ~ ., data = trainData, laplace = 1)
pred_nb_laplace <- predict(model_nb_laplace, testData)
confusionMatrix(pred_nb_laplace, testData$HeartDisease, positive = "Malade")
## Confusion Matrix and Statistics
##
## Reference
## Prediction Malade Normale
## Malade 156 23
## Normale 13 113
##
## Accuracy : 0.882
## 95% CI : (0.8404, 0.9159)
## No Information Rate : 0.5541
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7594
##
## Mcnemar's Test P-Value : 0.1336
##
## Sensitivity : 0.9231
## Specificity : 0.8309
## Pos Pred Value : 0.8715
## Neg Pred Value : 0.8968
## Prevalence : 0.5541
## Detection Rate : 0.5115
## Detection Prevalence : 0.5869
## Balanced Accuracy : 0.8770
##
## 'Positive' Class : Malade
##
modele_tree <- rpart(HeartDisease ~ ., data = trainData, method = "class", parms = list(split = "gini"))
pred_tree <- predict(modele_tree, newdata = testData, type = "class") # Prédiction sur les données de test
mat_tree <- confusionMatrix(pred_tree, testData$HeartDisease, positive = "Normale")
print(mat_tree)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Malade Normale
## Malade 153 32
## Normale 16 104
##
## Accuracy : 0.8426
## 95% CI : (0.7968, 0.8816)
## No Information Rate : 0.5541
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.6778
##
## Mcnemar's Test P-Value : 0.03038
##
## Sensitivity : 0.7647
## Specificity : 0.9053
## Pos Pred Value : 0.8667
## Neg Pred Value : 0.8270
## Prevalence : 0.4459
## Detection Rate : 0.3410
## Detection Prevalence : 0.3934
## Balanced Accuracy : 0.8350
##
## 'Positive' Class : Normale
##
rpart.plot(modele_tree, type = 2, extra = 104, fallen.leaves = TRUE)
tree_entropy <- rpart(HeartDisease ~ ., data = trainData,
method = "class",
parms = list(split = "information"),
control = rpart.control(cp = 0.01, maxdepth = 5))
pred_tree_entropy <- predict(tree_entropy, testData, type = "class")
confusionMatrix(pred_tree_entropy, testData$HeartDisease, positive = "Malade")
## Confusion Matrix and Statistics
##
## Reference
## Prediction Malade Normale
## Malade 152 33
## Normale 17 103
##
## Accuracy : 0.8361
## 95% CI : (0.7896, 0.8758)
## No Information Rate : 0.5541
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.6644
##
## Mcnemar's Test P-Value : 0.03389
##
## Sensitivity : 0.8994
## Specificity : 0.7574
## Pos Pred Value : 0.8216
## Neg Pred Value : 0.8583
## Prevalence : 0.5541
## Detection Rate : 0.4984
## Detection Prevalence : 0.6066
## Balanced Accuracy : 0.8284
##
## 'Positive' Class : Malade
##
Tableau des confusions avec ggplot2
mat_conf <- matrix(c(156, 30, 16, 104),
nrow = 2, byrow = TRUE,
dimnames = list("Classe réelle" = c("Malade", "Normale"),
"Classe prédite" = c("Malade", "Normale")))
df_long <- melt(mat_conf)
colnames(df_long) <- c("ClasseReelle", "ClassePredite", "Valeur")
df_long$ClasseReelle <- factor(df_long$ClasseReelle, levels = rev(c("Malade", "Normale")))
df_long$ClassePredite <- factor(df_long$ClassePredite, levels = c("Malade", "Normale"))
ggplot(df_long, aes(x = ClassePredite, y = ClasseReelle)) +
geom_tile(aes(fill = Valeur), color = "#f0f0f0", linewidth = 1.2) +
geom_text(aes(label = Valeur), color = "black", size = 6, fontface = "bold") +
scale_fill_gradient2(low = "#e0f7fa", mid = "#80deea", high = "#006064", midpoint = 100, guide = "none") +
labs(
title = "Matrice de confusion du modèle Arbre de Décision",
x = "Classe Prédite",
y = "Classe Réelle"
) +
theme_minimal(base_family = "Arial", base_size = 14) +
theme(
plot.title = element_text(hjust = 0.5, size = 15, face = "bold", color = "#006064"),
axis.text = element_text(face = "bold"),
panel.grid = element_blank()
)
Tableau de comparaison des modeles
# Les données
data <- tibble::tibble(
Modèles = c("Random Forest", "Naïve Bayes", "K-Nearest Neighbors", "Arbre de Décision"),
`Précision globale (Accuracy)` = c("86,6 %", "88,56 %", "87,02 %", "84,97 %"),
Sensibilité = c("89,53 %", "88,95 %", "87,22 %", "77,61 %"),
`Indice Kappa` = c("0,7267", "0,7682", "0,7355", "0,6778"),
`Balanced Accuracy` = c("86,19 %", "88,51 %", "86,96 %", "84,15 %")
)
# Création du tableau flextable
flextable(data) %>%
set_header_labels(
Modèles = "Modèles",
`Précision globale (Accuracy)` = "Précision globale (Accuracy)",
Sensibilité = "Sensibilité",
`Indice Kappa` = "Indice Kappa",
`Balanced Accuracy` = "Balanced Accuracy"
) %>%
bold(part = "header") %>%
fontsize(size = 12, part = "all") %>%
color(color = "black") %>%
bg(part = "header", bg = "#DDEBF7") %>%
autofit()
Modèles | Précision globale (Accuracy) | Sensibilité | Indice Kappa | Balanced Accuracy |
|---|---|---|---|---|
Random Forest | 86,6 % | 89,53 % | 0,7267 | 86,19 % |
Naïve Bayes | 88,56 % | 88,95 % | 0,7682 | 88,51 % |
K-Nearest Neighbors | 87,02 % | 87,22 % | 0,7355 | 86,96 % |
Arbre de Décision | 84,97 % | 77,61 % | 0,6778 | 84,15 % |